Python modules used.
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import plotly.offline as py
import datetime
import numpy as np
import math
import reverse_geocoder as rg
from scipy.stats import norm
from scipy.stats import expon
from scipy.stats import lognorm
from scipy.stats import weibull_min
from scipy.stats import weibull_max
from scipy.stats import alpha
from scipy.stats import beta
import scipy.stats as stats
%matplotlib inline
from scipy import stats
from fitter import Fitter
from IPython.display import Image
#plotly.offline.init_notebook_mode(connected=True)
py.init_notebook_mode(connected=False)
#ignore deprecation warnings
import warnings
warnings.filterwarnings('ignore')
import operator
import collections
from scipy.optimize import minimize
def objfun(x, *args):
    """Sum-of-squared-errors objective for CDF-based distribution fitting.

    Parameters
    ----------
    x : sequence of float
        Candidate distribution parameters (shape/loc/scale) as supplied by
        scipy.optimize.minimize.
    *args : tuple
        ``(dist, data, weights)`` where ``dist`` is a scipy.stats
        distribution, ``data`` holds the empirical plotting-position
        probabilities, and ``weights`` holds the observed snow weights (psf).

    Returns
    -------
    float
        Sum of squared differences between the model CDF evaluated at the
        sorted weights and the empirical probabilities.
    """
    dist, data, weights = args
    # Evaluate the candidate CDF at each observation in ascending order so
    # the values line up with the sorted plotting-position probabilities.
    fit = [dist.cdf(w, *x) for w in sorted(weights)]
    return sum((d - f) ** 2 for d, f in zip(data, fit))
def checkfit(data, fit):
    """Return the sum of squared errors between two equal-length sequences.

    Parameters
    ----------
    data : sequence of float
        Empirical (observed) values.
    fit : sequence of float
        Model values at the same points; assumed to be the same length
        as ``data``.

    Returns
    -------
    float
        ``sum((data[i] - fit[i]) ** 2)`` over all points.
    """
    return sum((d - f) ** 2 for d, f in zip(data, fit))
Data reports: https://www.nohrsc.noaa.gov/nsa/reports.html
Definitions of terms: https://www.nohrsc.noaa.gov/help/
# Read in the Surface Water Equivalent (SWE) data.
SWE = pd.read_csv('swe.csv')
# Remove CSV-export artifact columns plus the unused Zip_Code column.
SWE = SWE.drop(["Unnamed: 0","Unnamed: 0.1", "Unnamed: 10", "Unnamed: 9","Zip_Code"], axis=1)
# Drop zero readings (no snow on the ground).
SWE = SWE[SWE.Amount>0]
# Split the report timestamp into year/month/day columns for later grouping.
SWE['year'] = pd.DatetimeIndex(SWE['DateTime_Report(UTC)']).year
SWE['month'] = pd.DatetimeIndex(SWE['DateTime_Report(UTC)']).month
SWE['day'] = pd.DatetimeIndex(SWE['DateTime_Report(UTC)']).day
# Add a column holding the number of readings each station contributed.
SWE['StationCounts'] = SWE.groupby(['Station_Id'])['Station_Id'].transform('count')
# Keep only stations with more than 'mincount' data points.
mincount = 100
SWE = SWE[SWE['StationCounts']>mincount]
# Copy with one row per unique station, used for plotting stations on a map.
SWE_Stations = SWE.drop_duplicates('Station_Id',keep='first')
# Drop all stations outside of Massachusetts.
# ONLY USED FOR SHOWING THE MAP PLOT OF STATIONS.
# reverse_geocoder accepts a list of (lat, lon) tuples, so do ONE batched
# lookup for all stations instead of one rg.search call per row -- the
# per-row loop was the reason this step "TAKES A WHILE".
station_coords = list(zip(SWE_Stations['Latitude'], SWE_Stations['Longitude']))
geo_results = rg.search(station_coords)
# Boolean mask aligned with SWE_Stations' row order: True = keep the row.
in_mass = [res['admin1'] == "Massachusetts" for res in geo_results]
SWE_Stations = SWE_Stations[in_mass]
# Plot the surviving stations on an OpenStreetMap basemap, colored by the
# number of readings each station contributed.
fig = px.scatter_mapbox(SWE_Stations, lon="Longitude", lat="Latitude", hover_name="Station_Id", hover_data=["Station_Id","StationCounts"],
zoom=4, height=300,color="StationCounts",color_continuous_scale=px.colors.sequential.Viridis)
fig.update_layout(mapbox_style="open-street-map")
# Remove the default figure margins.
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
print("Plot of sites in MA")
fig.show()
#plotly.io.write_image(fig,"fig1.png")
#Image('fig1.png')
# GET ANNUAL SWE MAXIMUMS (for each station).
# GET MONTHLY SWE MAXIMUMS (for each station).
# The entire row containing the maximum is carried forward.
MONTHLY_DATA=pd.DataFrame()
ANNUAL_DATA=pd.DataFrame()
#for all stations?
#for station in set(SWE_Stations['Station_Id']):
#select the station with the most data
for station in ['GRFM3']:
    for yr in range(2006,2020):
        tmp1 = SWE[(SWE['year']==yr) & (SWE['Station_Id']==station)]
        # DataFrame.append was removed in pandas 2.x; use pd.concat with an
        # explicit empty check instead of a bare try/except.
        if not tmp1.empty:
            # .loc[[idx]] (double brackets) keeps the result a one-row frame.
            ANNUAL_DATA = pd.concat([ANNUAL_DATA, tmp1.loc[[tmp1['Amount'].idxmax()]]])
        for month in range(1,13):
            tmp2 = SWE[(SWE['year']==yr) & (SWE['month']==month) & (SWE['Station_Id']==station)]
            if not tmp2.empty:
                MONTHLY_DATA = pd.concat([MONTHLY_DATA, tmp2.loc[[tmp2['Amount'].idxmax()]]])
# Limit monthly data to full years.
MONTHLY_DATA = MONTHLY_DATA[(MONTHLY_DATA['year']>2007)]
# Drop some columns no longer needed.
ANNUAL_DATA = ANNUAL_DATA.drop(["StationCounts","Latitude", "Longitude"], axis=1)
MONTHLY_DATA = MONTHLY_DATA.drop(["StationCounts","Latitude", "Longitude"], axis=1)
# For plotting, insert zero rows for months without data so the line drops
# to zero instead of interpolating across the gap.
zero_rows = []
for yr in range(2007,2020):
    for month in range(1,13):
        tmp = MONTHLY_DATA[(MONTHLY_DATA['year']==yr) & (MONTHLY_DATA['month']==month)]
        if tmp.shape[0]==0:
            # Timestamp string matches the raw data format, e.g. '2006-11-25 16'.
            zero_rows.append([yr,month,15,0,str(yr)+"-"+"%02i" %month+"-01 00"])
if zero_rows:
    # DataFrame.append was removed in pandas 2.x; concat all fill rows once.
    dfadd = pd.DataFrame(zero_rows, columns = ['year','month','day','Amount','DateTime_Report(UTC)'])
    MONTHLY_DATA = pd.concat([MONTHLY_DATA, dfadd])
MONTHLY_DATA = MONTHLY_DATA.sort_values(by=['DateTime_Report(UTC)'])
# Plot the annual and monthly maxima over time.
# go.Line is deprecated in plotly; go.Scatter with mode='lines' is the
# documented equivalent.
fig = go.Figure()
fig.add_trace(go.Scatter(x=MONTHLY_DATA['DateTime_Report(UTC)'], y=MONTHLY_DATA['Amount'],
                         mode='lines', marker=dict(color='red', size=8), line_shape='hvh',
                         name='Monthly Maximum Data'))
fig.add_trace(go.Scatter(x=ANNUAL_DATA['DateTime_Report(UTC)'], y=ANNUAL_DATA['Amount'],
                         mode='lines', marker=dict(color='black', size=8), line_shape='hvh',
                         name='Annual Maximum Data'))
fig.update_layout(
    title="Annual & Monthly Maximum SWE Data",
    xaxis_title="Date",
    yaxis_title="SWE (inches of water)",
    template='plotly_white')
fig.show()
#plotly.io.write_image(fig,"fig2.png")
#Image('fig2.png')
# Remove the zero rows that were added only for plotting.
ANNUAL_DATA = ANNUAL_DATA[ANNUAL_DATA.Amount>0]
MONTHLY_DATA = MONTHLY_DATA[MONTHLY_DATA.Amount>0]
# Sorted annual-maximum SWE values (inches of water).
swe_vals_annual = sorted(list(ANNUAL_DATA["Amount"]))
h20 = 5.202288 #psf per inch of depth
# Convert inches of water to snow load in pounds per square foot.
weight_vals_annual = [i*h20 for i in swe_vals_annual]
n=len(weight_vals_annual)
# Try the fitter module: fits many scipy.stats distributions to a histogram
# of the data and ranks them by sum-of-squares error.
data_array = np.asarray(weight_vals_annual)
f = Fitter(data_array,verbose=False)
f.fit()
f.summary()
ErrorThreshold = 0.03
fig = go.Figure()
n = len(weight_vals_annual)
# Plotting positions P_i = i/(n+1) for the sorted annual maxima.
# The original looped over range(len(sorted(...))) -- the sort only fed
# len(), so it was pointless; a comprehension over range(n) is equivalent.
# NOTE(review): i starts at 0, so the smallest observation gets P = 0;
# the usual Weibull position is (i+1)/(n+1) -- confirm this is intended.
Pvals_annual = [i/(n+1) for i in range(n)]
# Candidate scipy.stats continuous distributions, kept in lockstep with the
# name list below (index i of one corresponds to index i of the other).
# NOTE(review): stats.rv_continuous and stats.rv_histogram are base classes,
# not fittable distributions -- presumably fitter skips them; confirm they
# never reach the fitting step.
DISTRIBUTIONS = [stats.alpha,stats.anglit,stats.arcsine,stats.argus,stats.beta,stats.betaprime,stats.bradford,stats.burr,stats.burr12,stats.cauchy,stats.chi,stats.chi2,stats.cosine,stats.crystalball,stats.dgamma,stats.dweibull,stats.erlang,stats.expon,stats.exponnorm,stats.exponpow,stats.exponweib,stats.f,stats.fatiguelife,stats.fisk,stats.foldcauchy,stats.foldnorm,stats.frechet_l,stats.frechet_r,stats.gamma,stats.gausshyper,stats.genexpon,stats.genextreme,stats.gengamma,stats.genhalflogistic,stats.genlogistic,stats.gennorm,stats.genpareto,stats.gilbrat,stats.gompertz,stats.gumbel_l,stats.gumbel_r,stats.halfcauchy,stats.halfgennorm,stats.halflogistic,stats.halfnorm,stats.hypsecant,stats.invgamma,stats.invgauss,stats.invweibull,stats.johnsonsb,stats.johnsonsu,stats.kappa3,stats.kappa4,stats.ksone,stats.kstwobign,stats.laplace,stats.levy,stats.levy_l,stats.levy_stable,stats.loggamma,stats.logistic,stats.loglaplace,stats.lognorm,stats.lomax,stats.maxwell,stats.mielke,stats.moyal,stats.nakagami,stats.ncf,stats.nct,stats.ncx2,stats.norm,stats.norminvgauss,stats.pareto,stats.pearson3,stats.powerlaw,stats.powerlognorm,stats.powernorm,stats.rayleigh,stats.rdist,stats.recipinvgauss,stats.reciprocal,stats.rice,stats.rv_continuous,stats.rv_histogram,stats.semicircular,stats.skewnorm,stats.t,stats.trapz,stats.triang,stats.truncexpon,stats.truncnorm,stats.tukeylambda,stats.uniform,stats.vonmises,stats.vonmises_line,stats.wald,stats.weibull_max,stats.weibull_min,stats.wrapcauchy]
DIST_NAMES = ['alpha','anglit','arcsine','argus','beta','betaprime','bradford','burr','burr12','cauchy','chi','chi2','cosine','crystalball','dgamma','dweibull','erlang','expon','exponnorm','exponpow','exponweib','f','fatiguelife','fisk','foldcauchy','foldnorm','frechet_l','frechet_r','gamma','gausshyper','genexpon','genextreme','gengamma','genhalflogistic','genlogistic','gennorm','genpareto','gilbrat','gompertz','gumbel_l','gumbel_r','halfcauchy','halfgennorm','halflogistic','halfnorm','hypsecant','invgamma','invgauss','invweibull','johnsonsb','johnsonsu','kappa3','kappa4','ksone','kstwobign','laplace','levy','levy_l','levy_stable','loggamma','logistic','loglaplace','lognorm','lomax','maxwell','mielke','moyal','nakagami','ncf','nct','ncx2','norm','norminvgauss','pareto','pearson3','powerlaw','powerlognorm','powernorm','rayleigh','rdist','recipinvgauss','reciprocal','rice','rv_continuous','rv_histogram','semicircular','skewnorm','t','trapz','triang','truncexpon','truncnorm','tukeylambda','uniform','vonmises','vonmises_line','wald','weibull_max','weibull_min','wrapcauchy']
# Overlay each fitter-estimated CDF on the empirical CDF of the annual
# maxima. Curves whose histogram-fit error exceeds ErrorThreshold start
# hidden (legend-only) so the plot stays readable.
skips = []
for name, dist in zip(DIST_NAMES, DISTRIBUTIONS):
    if name in skips:
        continue
    error = f.df_errors['sumsquare_error'][name]
    visible = 'legendonly' if (np.isnan(error) or error > ErrorThreshold) else True
    try:
        params = f.fitted_param[name]
        # Trace the fitted CDF as quantiles: x = ppf(p) for p in (0, 1).
        # (The original wrote "yy = x=np.linspace(...)"; the extra `x`
        # alias was never used, and the comprehension variable shadowed
        # the outer loop index -- both cleaned up here.)
        yy = np.linspace(0.001, 0.999, 98)
        xx = [dist.ppf(p, *params) for p in yy]
        fig.add_trace(go.Scatter(x=xx, y=yy, line=dict(width=1), hoverinfo='name+text',
                                 mode='lines', hovertext="%.4f" % error, visible=visible, name=name))
    except KeyError:
        # Distribution was not successfully fitted by fitter; skip it.
        pass
# Empirical CDF of the observed annual maxima.
fig.add_trace(go.Scatter(x=sorted(weight_vals_annual), y=Pvals_annual,
                         mode='markers', marker=dict(color='black', size=8),
                         name='data from station: GRFM3'))
fig.update_layout(
    title="Fitting Distributions to Annual Maximum Data",
    xaxis_title="Annual Maximum Measured Snow Weight (psf)",
    yaxis_title="Probability",
    template='plotly_white')
fig.update_xaxes(range=[0, 140])
fig.show()
#plotly.io.write_image(fig,"fig3.png")
#Image('fig3.png')
These don't appear to fit very well: the fit is based on the histogram (PDF), and there isn't enough data here to make a decent histogram. Instead, solve for optimized parameters using a custom objective function that compares CDF values.
# Refit every distribution by minimizing the CDF-vs-plotting-position
# sum-of-squares (objfun), seeded with fitter's histogram-based parameters.
# THIS TAKES A WHILE...
ERROR={}
NEW_PARAMS={}
for name, dist in zip(DIST_NAMES, DISTRIBUTIONS):
    # Only distributions fitter successfully fitted have seed parameters;
    # an explicit membership test replaces the old commented-out try/except.
    if name in f.fitted_param:
        params = f.fitted_param[name]
        sol = minimize(objfun, params, args=(dist, Pvals_annual, weight_vals_annual))
        ERROR[name] = sol.fun
        NEW_PARAMS[name] = sol.x
# Order the distributions best-to-worst by optimized error.
ERROR = collections.OrderedDict(sorted(ERROR.items(), key=operator.itemgetter(1)))
# Re-plot the annual empirical CDF with the CDF-optimized fits overlaid;
# the threshold is tighter now that the fits are much better.
fig = go.Figure()
ErrorThreshold=0.01
skips = []
# The original wrapped this body in a dead "if True:" left over from a
# commented-out try/except; that scaffolding is removed here.
for name, dist in zip(DIST_NAMES, DISTRIBUTIONS):
    if name in skips or name not in ERROR:
        continue
    error = ERROR[name]
    visible = 'legendonly' if (np.isnan(error) or error > ErrorThreshold) else True
    params = NEW_PARAMS[name]
    # Trace the fitted CDF as quantiles (the stray "x=" alias is dropped).
    yy = np.linspace(0.001, 0.999, 98)
    xx = [dist.ppf(p, *params) for p in yy]
    fig.add_trace(go.Scatter(x=xx, y=yy, line=dict(width=1), hoverinfo='name+text',
                             mode='lines', hovertext="%.4f" % error, visible=visible, name=name))
# Empirical CDF of the observed annual maxima.
fig.add_trace(go.Scatter(x=sorted(weight_vals_annual), y=Pvals_annual,
                         mode='markers', marker=dict(color='black', size=8),
                         name='data from station: GRFM3'))
fig.update_layout(
    title="Fitting Distributions to Annual Maximum Data",
    xaxis_title="Annual Maximum Measured Snow Weight (psf)",
    yaxis_title="Probability",
    template='plotly_white')
fig.update_xaxes(range=[0, 140])
fig.show()
# Use the best-fit distributions to estimate the 50-year MRI event.
ErrorThreshold = 0.009
# Membership tests replace the old try/except KeyError: ERROR only holds
# the distributions that were actually refit.
best_fits = [name for name in DIST_NAMES
             if name not in skips and name in ERROR
             and not np.isnan(ERROR[name]) and ERROR[name] < ErrorThreshold]
print("50yr MRI Snow Load from Annual Maximums")
for fit in best_fits:
    dist = DISTRIBUTIONS[DIST_NAMES.index(fit)]
    params = NEW_PARAMS[fit]
    # Annual non-exceedance probability for a 50-year MRI: 1 - 1/50 = 0.98.
    yy = 0.98
    xx = dist.ppf(yy, *params)
    print("%20s"%fit,"\t","%.2f" %xx,"psf")
# Repeat the analysis with the monthly maxima.
swe_vals_monthly = sorted(list(MONTHLY_DATA["Amount"]))
h20 = 5.202288 #psf per inch of depth
# Convert inches of water to snow load in pounds per square foot.
weight_vals_monthly = [i*h20 for i in swe_vals_monthly]
# (The original assigned n twice in a row; once is enough.)
n = len(weight_vals_monthly)
# Plotting positions i/(n+1); again the original sorted the list just to
# take its length. X is simply a copy of the (already sorted) weights.
Pvals_monthly = [i/(n+1) for i in range(n)]
X = list(weight_vals_monthly)
# Refit each distribution to the monthly data, seeded with fitter's
# histogram-based parameters.
ERROR_monthly={}
NEW_PARAMS_monthly={}
for name, dist in zip(DIST_NAMES, DISTRIBUTIONS):
    # Skip distributions fitter could not fit (no seed parameters).
    if name not in f.fitted_param:
        continue
    params = f.fitted_param[name]
    try:
        sol = minimize(objfun, params, args=(dist, Pvals_monthly, weight_vals_monthly))
    except Exception:
        # Some distributions fail numerically on this data; keep the
        # best-effort behavior, but no longer with a bare `except:` that
        # would also swallow KeyboardInterrupt/SystemExit.
        continue
    ERROR_monthly[name] = sol.fun
    NEW_PARAMS_monthly[name] = sol.x
# Order the distributions best-to-worst by optimized error.
ERROR_monthly = collections.OrderedDict(
    sorted(ERROR_monthly.items(), key=operator.itemgetter(1)))
# Plot the monthly-maximum empirical CDF with each optimized fit overlaid.
# Fits worse than ErrorThreshold start hidden (legend-only).
fig = go.Figure()
ErrorThreshold = 0.033
skips = []
for idx, dist_name in enumerate(DIST_NAMES):
    if dist_name in skips or dist_name not in ERROR_monthly:
        continue
    err = ERROR_monthly[dist_name]
    if np.isnan(err) or err > ErrorThreshold:
        trace_visibility = 'legendonly'
    else:
        trace_visibility = True
    distribution = DISTRIBUTIONS[idx]
    fitted = NEW_PARAMS_monthly[dist_name]
    # Trace the fitted CDF as quantiles: x = ppf(p) over p in (0, 1).
    probs = np.linspace(0.001, 0.999, 98)
    quantiles = [distribution.ppf(p, *fitted) for p in probs]
    fig.add_trace(go.Scatter(x=quantiles, y=probs, line=dict(width=1), hoverinfo='name+text',
                             mode='lines', hovertext="%.4f" % err,
                             visible=trace_visibility, name=dist_name))
# Empirical CDF of the observed monthly maxima.
fig.add_trace(go.Scatter(x=sorted(weight_vals_monthly), y=Pvals_monthly,
                         mode='markers', marker=dict(color='black', size=8),
                         name='data from station: GRFM3'))
fig.update_layout(
    title="Fitting Distributions to Monthly Maximum Data",
    xaxis_title="Monthly Maximum Measured Snow Weight (psf)",
    yaxis_title="Probability",
    template='plotly_white')
fig.update_xaxes(range=[0, 140])
fig.show()
#plotly.io.write_image(fig,"fig4.png")
#Image('fig4.png')
# Use the acceptable monthly fits to estimate the 50-year MRI event.
ErrorThreshold = 0.03
best_fits = [name for name in DIST_NAMES
             if name not in skips and name in ERROR_monthly
             and not np.isnan(ERROR_monthly[name])
             and ERROR_monthly[name] < ErrorThreshold]
# Non-exceedance probability for a 50yr MRI depends on events per year:
#   annual:                        0.98   = 1 - 1/50
#   monthly (12 months/yr):        0.9983 = 1 - 1/(12*50)
#   monthly (2.4 snow months/yr):  0.9917 = 1 - 1/(2.4*50)
# Estimate the actual number of snow months per year from the data itself.
months_per_year = MONTHLY_DATA.shape[0]/(MONTHLY_DATA['year'].max()-MONTHLY_DATA['year'].min()+1)
yy = 1 - (1/(months_per_year*50))
#print(months_per_year,yy,"\n\n")
print("\t\t50yr MRI Snow Load from Monthly Maximums")
for fit in best_fits:
    dist = DISTRIBUTIONS[DIST_NAMES.index(fit)]
    params = NEW_PARAMS_monthly[fit]
    xx = dist.ppf(yy, *params)
    print("%20s"%fit,"\t","%.2f" %xx,"psf")
These values are so high that I may have something wrong here. The ASCE 7 ground snow load is 50 psf, while I'm estimating roughly 130 psf from the annual data and 700+ psf from the monthly data.